Student name and ID: Zeinab Khansari, 501076000
Supervisor: Dr. Tamer Abdou
# Installing libraries and setting directory
library(lubridate)
library(EnvStats)
library(tidyr)
library(pillar)
library(stringi)
library(readr)
library(tidyverse)
library(purrr)
library(plyr)
library(tibble)
library(rlang)
library(readxl)
library(anytime)
library(ggplot2)
library(devtools)
library(ggpubr)
library(reshape2)
library(dplyr)
library(Hmisc)
library(outliers)
library(magrittr)
library(mlbench)
library(caret)
library(stats)
library(corrplot)
library(TH.data)
library(Boruta)
library(MTS)
library(corrplot)
library(mice)
library(VIM)
library(missRanger)
library(caTools)
library(FSinR)
library(olsrr)
library(knitr)
options(stringAsFactors=FALSE, warnings=FALSE)
setwd("/Users/zeinabkhansari/Desktop/Capstone")d1 <- read_excel("/Users/zeinabkhansari/Desktop/Capstone/MISA_2004.xls")
d2 <- read_excel("/Users/zeinabkhansari/Desktop/Capstone/MISA_2005.xls")
d3 <- read_excel("/Users/zeinabkhansari/Desktop/Capstone/MISA_2006.xls")
d4 <- read_excel("/Users/zeinabkhansari/Desktop/Capstone/MISA_2007.xls")
d5 <- read_excel("/Users/zeinabkhansari/Desktop/Capstone/MISA_2008.xls")
d6 <- read_excel("/Users/zeinabkhansari/Desktop/Capstone/MISA_2009.xls")
d7 <- read_excel("/Users/zeinabkhansari/Desktop/Capstone/MISA_2010.xls")
d8 <- read_excel("/Users/zeinabkhansari/Desktop/Capstone/MISA_2011.xls")
d9 <- read_excel("/Users/zeinabkhansari/Desktop/Capstone/MISA_2012.xls")
d10 <- read_excel("/Users/zeinabkhansari/Desktop/Capstone/MISA_2013.xlsx")
d11 <- read_excel("/Users/zeinabkhansari/Desktop/Capstone/MISA_2014.xlsx")
d12 <- read_excel("/Users/zeinabkhansari/Desktop/Capstone/MISA_2015.xlsx")
d13 <- read_excel("/Users/zeinabkhansari/Desktop/Capstone/MISA_2016.xlsx")
d14 <- read_excel("/Users/zeinabkhansari/Desktop/Capstone/MISA_2017.xlsx")
d15 <- read_excel("/Users/zeinabkhansari/Desktop/Capstone/MISA_2018.xlsx")
d16 <- read_excel("/Users/zeinabkhansari/Desktop/Capstone/MISA_2019.xlsx")# Let's take a look at each individual dataset (excel file)
# removing extra column from d1
d1 <- d1[,-16]
# d9 has only month and year with no date
# adding date to month and year to uniform the date format
d9$DAY <- "01"
d9$Date <- as.Date(with(d9, paste(YEAR, MONTH, DAY,sep="-")), "%Y-%m-%d")
# names(d1)
# names(d2)
# names(d3)
# columns' names are not consistent throughout the excel files
# changing names of some columns to keep them consistent with others
colnames(d3)[5] <- "Date"
colnames(d3)[1] <- "Sector"
# names(d4)
# names(d5)
# names(d6)
# names(d7)
# names(d8)
# renaming d8 attributes to keep the consistency throughout the whole dataset
colnames(d8)[1] <- "Sector"
colnames(d8)[2] <- "Works Name"
colnames(d8)[3] <- "Company Code"
colnames(d8)[4] <- "Municipality"
colnames(d8)[5] <- "Date"
colnames(d8)[6] <- "Control Point ID"
colnames(d8)[7] <- "Control Point Name"
colnames(d8)[8] <- "Parameter Name"
colnames(d8)[9] <- "Parameter Reported As"
colnames(d8)[10] <- "Result Structure"
colnames(d8)[11] <- "Component Type"
colnames(d8)[12] <- "Frequency"
colnames(d8)[13] <- "Value"
colnames(d8)[14] <- "Unit of Measure"
colnames(d8)[15] <- "Regulation"
# also, the order of columns differ throughout the dataset between excel files
# changing the order of columns to make them comparable and consistent with the rest of the dataset
d8 <- d8[,c(1, 2, 3, 4, 5, 7, 6, 8, 9, 12, 10, 11, 13, 14, 15)]
# names(d9)
# renaming the d9 attributes
colnames(d9)[1] <- "Sector"
colnames(d9)[2] <- "Works Name"
colnames(d9)[3] <- "Company Code"
colnames(d9)[4] <- "Municipality"
colnames(d9)[7] <- "Control Point ID"
colnames(d9)[8] <- "Control Point Name"
colnames(d9)[9] <- "Parameter Name"
colnames(d9)[10] <- "Parameter Reported As"
colnames(d9)[11] <- "Result Structure"
colnames(d9)[12] <- "Component Type"
colnames(d9)[15] <- "Unit of Measure"
# changing the columns' order
d9 <- d9[, -c(5, 6, 17)]
d9 <- d9[, c(1, 2, 3, 4, 15, 6, 5, 7, 8, 11, 9, 10, 12, 13, 14)]
colnames(d9)[10] <- "Frequency"
colnames(d9)[13] <- "Value"
colnames(d9)[15] <- "Regulation"
# names(d10)
# changing the names
colnames(d10)[1] <- "Sector"
colnames(d10)[5] <- "Date"
# names(d11)
colnames(d11)[5] <- "Date"
colnames(d11)[1] <- "Sector"
# names(d12)
colnames(d12)[5] <- "Date"
colnames(d12)[1] <- "Sector"
# names(d13)
colnames(d13)[5] <- "Date"
colnames(d13)[1] <- "Sector"
# names(d14)
# names(d15)
# names(d16)
colnames(d14)[1] <- "Sector"
colnames(d14)[2] <- "Works Name"
colnames(d14)[3] <- "Company Code"
colnames(d14)[4] <- "Municipality"
colnames(d14)[5] <- "Date"
colnames(d14)[6] <- "Control Point ID"
colnames(d14)[7] <- "Control Point Name"
colnames(d14)[8] <- "Parameter Name"
colnames(d14)[9] <- "Parameter Reported As"
colnames(d14)[10] <- "Frequency"
colnames(d14)[11] <- "Result Structure"
colnames(d14)[12] <- "Component Type"
colnames(d14)[13] <- "Value"
colnames(d14)[14] <- "Unit of Measure"
colnames(d14)[15] <- "Regulation"
colnames(d15)[1] <- "Sector"
colnames(d15)[2] <- "Works Name"
colnames(d15)[3] <- "Company Code"
colnames(d15)[4] <- "Municipality"
colnames(d15)[5] <- "Date"
colnames(d15)[6] <- "Control Point ID"
colnames(d15)[7] <- "Control Point Name"
colnames(d15)[8] <- "Parameter Name"
colnames(d15)[9] <- "Parameter Reported As"
colnames(d15)[10] <- "Frequency"
colnames(d15)[11] <- "Result Structure"
colnames(d15)[12] <- "Component Type"
colnames(d15)[13] <- "Value"
colnames(d15)[14] <- "Unit of Measure"
colnames(d15)[15] <- "Regulation"
colnames(d16)[1] <- "Sector"
colnames(d16)[2] <- "Works Name"
colnames(d16)[3] <- "Company Code"
colnames(d16)[4] <- "Municipality"
colnames(d16)[5] <- "Date"
colnames(d16)[6] <- "Control Point ID"
colnames(d16)[7] <- "Control Point Name"
colnames(d16)[8] <- "Parameter Name"
colnames(d16)[9] <- "Parameter Reported As"
colnames(d16)[10] <- "Frequency"
colnames(d16)[11] <- "Result Structure"
colnames(d16)[12] <- "Component Type"
colnames(d16)[13] <- "Value"
colnames(d16)[14] <- "Unit of Measure"
colnames(d16)[15] <- "Regulation"
# adjusting the date data format throughout the whole dataset
d1$Date <- anydate(d1$Date)
d2$Date <- anydate(d2$Date)
d3$Date <- anydate(d3$Date)
d4$Date <- anydate(d4$Date)
d5$Date <- anydate(d5$Date)
d6$Date <- anydate(d6$Date)
d7$Date <- anydate(d7$Date)
d8$Date <- anydate(d8$Date)
d9$Date <- anydate(d9$Date)
d10$Date <- anydate(d10$Date)
d11$Date <- anydate(d11$Date)
d12$Date <- anydate(d12$Date)
d13$Date <- anydate(d13$Date)
d14$Date <- anydate(d14$Date)
d15$Date <- anydate(d15$Date)
d16$Date <- anydate(d16$Date)
# changing the data class from character and binary to numeric
d1$Value <- as.numeric(d1$Value)
d2$Value <- as.numeric(d2$Value)
d3$Value <- as.numeric(d3$Value)
d4$Value <- as.numeric(d4$Value)
d5$Value <- as.numeric(d5$Value)
d6$Value <- as.numeric(d6$Value)
d7$Value <- as.numeric(d7$Value)
d8$Value <- as.numeric(d8$Value)
d9$Value <- as.numeric(d9$Value)
d10$Value <- as.numeric(d10$Value)
d11$Value <- as.numeric(d11$Value)
d12$Value <- as.numeric(d12$Value)
d13$Value <- as.numeric(d13$Value)
d14$Value <- as.numeric(d14$Value)
d15$Value <- as.numeric(d15$Value)
d16$Value <- as.numeric(d16$Value)
# phewwww! now all excel files can get merged together safely! :D
# merging all data into two dataset -- dd will be the dataset for further analysis and creating the model
dd <- bind_rows(d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15, d16)
names(dd)## [1] "Sector" "Works Name" "Company Code"
## [4] "Municipality" "Date" "Control Point Name"
## [7] "Control Point ID" "Parameter Name" "Parameter Reported As"
## [10] "Frequency" "Result Structure" "Component Type"
## [13] "Value" "Unit of Measure" "Regulation"
# counting the number of NAs in the dataset
cat("The total counts for NAs: ", sum(is.na(dd)), "\n")## The total counts for NAs: 66990
cat("The total number of non-NAs: ", sum(!is.na(dd)), "\n")## The total number of non-NAs: 8956530
cat("The ratio of NAs/Total observations: ", sum(is.na(dd))/(sum(is.na(dd))+sum(!is.na(dd))))## The ratio of NAs/Total observations: 0.007423932
# eliminating the NAs
dd <- drop_na(dd)Since NAs are less than one percent of the total count of observations, I’ve decided to remove them instead of replacing them with mean, interpolation, etc.
str(dd)## tibble [534,578 × 15] (S3: tbl_df/tbl/data.frame)
## $ Sector : chr [1:534578] "INORGANIC CHEMICALS" "INORGANIC CHEMICALS" "INORGANIC CHEMICALS" "INORGANIC CHEMICALS" ...
## $ Works Name : chr [1:534578] "GENERAL CHEMICAL CANADA LTD. (AMHERSTBURG)" "GENERAL CHEMICAL CANADA LTD. (AMHERSTBURG)" "GENERAL CHEMICAL CANADA LTD. (AMHERSTBURG)" "GENERAL CHEMICAL CANADA LTD. (AMHERSTBURG)" ...
## $ Company Code : chr [1:534578] "0000010009" "0000010009" "0000010009" "0000010009" ...
## $ Municipality : chr [1:534578] "AMHERSTBURG" "AMHERSTBURG" "AMHERSTBURG" "AMHERSTBURG" ...
## $ Date : Date[1:534578], format: "2004-01-01" "2004-01-01" ...
## $ Control Point Name : chr [1:534578] "PLANT - PROCESS EFFLUENT" "PLANT - PROCESS EFFLUENT" "PLANT - PROCESS EFFLUENT" "PLANT - PROCESS EFFLUENT" ...
## $ Control Point ID : chr [1:534578] "0700" "0700" "0700" "0700" ...
## $ Parameter Name : chr [1:534578] "AMMONIUM+AMMONIA, TOTAL FILTER.REAC" "AMMONIUM+AMMONIA, TOTAL FILTER.REAC" "AMMONIUM+AMMONIA, TOTAL FILTER.REAC" "AMMONIUM+AMMONIA, TOTAL FILTER.REAC" ...
## $ Parameter Reported As: chr [1:534578] "AS NITROGEN" "AS NITROGEN" "AS NITROGEN" "AS NITROGEN" ...
## $ Frequency : chr [1:534578] "MONTHLY" "MONTHLY" "MONTHLY" "MONTHLY" ...
## $ Result Structure : chr [1:534578] "MISA MONTHLY REPORTING" "MISA MONTHLY REPORTING" "MISA MONTHLY REPORTING" "MISA MONTHLY REPORTING" ...
## $ Component Type : chr [1:534578] "AVERAGE" "MAXIMUM" "MINIMUM" "NUM. IN AVERAGE" ...
## $ Value : num [1:534578] 195.4 389.5 43.5 31 0 ...
## $ Unit of Measure : chr [1:534578] "KG/D" "KG/D" "KG/D" "KG/D" ...
## $ Regulation : chr [1:534578] "MISA COMPLIANCE" "MISA COMPLIANCE" "MISA COMPLIANCE" "MISA COMPLIANCE" ...
head(dd, 10)## # A tibble: 10 x 15
## Sector `Works Name` `Company Code` Municipality Date `Control Point …
## <chr> <chr> <chr> <chr> <date> <chr>
## 1 INORGA… GENERAL CHEM… 0000010009 AMHERSTBURG 2004-01-01 PLANT - PROCESS…
## 2 INORGA… GENERAL CHEM… 0000010009 AMHERSTBURG 2004-01-01 PLANT - PROCESS…
## 3 INORGA… GENERAL CHEM… 0000010009 AMHERSTBURG 2004-01-01 PLANT - PROCESS…
## 4 INORGA… GENERAL CHEM… 0000010009 AMHERSTBURG 2004-01-01 PLANT - PROCESS…
## 5 INORGA… GENERAL CHEM… 0000010009 AMHERSTBURG 2004-01-01 PLANT - PROCESS…
## 6 INORGA… GENERAL CHEM… 0000010009 AMHERSTBURG 2004-01-01 PLANT - PROCESS…
## 7 INORGA… GENERAL CHEM… 0000010009 AMHERSTBURG 2004-01-01 PLANT - PROCESS…
## 8 INORGA… GENERAL CHEM… 0000010009 AMHERSTBURG 2004-01-01 PLANT - PROCESS…
## 9 INORGA… GENERAL CHEM… 0000010009 AMHERSTBURG 2004-01-01 PLANT - PROCESS…
## 10 INORGA… GENERAL CHEM… 0000010009 AMHERSTBURG 2004-01-01 PLANT - PROCESS…
## # … with 9 more variables: Control Point ID <chr>, Parameter Name <chr>,
## # Parameter Reported As <chr>, Frequency <chr>, Result Structure <chr>,
## # Component Type <chr>, Value <dbl>, Unit of Measure <chr>, Regulation <chr>
# name of the parameters included in dataset
unique(dd$`Parameter Name`)## [1] "AMMONIUM+AMMONIA, TOTAL FILTER.REAC"
## [2] "ARSENIC, UNFILTERED TOTAL"
## [3] "CARBON, DISSOLVED ORGANIC"
## [4] "CHLORIDE, UNFIL.REAC"
## [5] "CHLOROFORM CHCL3"
## [6] "CYANIDE, AVAIL, UNFIL.REAC"
## [7] "FLOW"
## [8] "FLUORIDE, UNFILTERED REACTIVE"
## [9] "MERCURY, UNFILTERED TOTAL"
## [10] "MOLYBDENUM,UNFILTERED TOTAL"
## [11] "NITRATES TOTAL, FILTER.REAC"
## [12] "NITROGEN,TOT,KJELDAHL/UNF.REA"
## [13] "PHOSPHORUS,UNFILTERED TOTAL"
## [14] "RESIDUE, PARTICULATE"
## [15] "SOLVENT EXTRACTABLES"
## [16] "SULPHIDE, UNFILTERED REACTIVE"
## [17] "ALUMINIUM, UNFILTERED TOTAL"
## [18] "PHENOLICS, UNFILTERED REACTIVE"
## [19] "ZINC, UNFILTERED TOTAL"
## [20] "BENZENE C6H6"
## [21] "BROMOFORM"
## [22] "BROMOMETHANE"
## [23] "CHLORO METHANE"
## [24] "COBALT, UNFILTERED TOTAL"
## [25] "BENZO(A)PYRENE"
## [26] "LEAD, UNFILTERED TOTAL"
## [27] "NAPHTHALENE"
## [28] "RESIDUE,PART LOSS ON IGNIT."
## [29] "TOLUENE C7H8"
## [30] "VINYL CHLORIDE C2H3CL"
## [31] "HEXACHLOROBUTADIENE"
## [32] "CARBON TETRACHLORIDE"
## [33] "CHROMIUM, UNFILTERED TOTAL"
## [34] "COPPER, UNFILTERED TOTAL"
## [35] "HEXACHLOROBENZENE"
## [36] "TETRACHLOROETHYLENE"
## [37] "VANADIUM, UNFILTERED TOTAL"
## [38] "BIPHENYL"
## [39] "DIPHENYL ETHER"
## [40] "PCB TOTAL"
## [41] "BIOCHEMICAL OXYGEN DEMAND, 5 DAY, TOTAL DEMAND"
## [42] "PHENOL"
## [43] "ADSORBABLE ORGANIC HALIDE"
## [44] "ANTIMONY, UNFILTERED TOTAL"
## [45] "NICKEL, UNFILTERED TOTAL"
## [46] "BIS-2-CHLOROISOPROPYL ETHER"
## [47] "DICHLOROETHANE 1,2"
## [48] "DICHLOROPROPANE 1,2"
## [49] "ETHYLBENZENE C8H10"
## [50] "HEXACHLOROETHANE"
## [51] "METHYLENE CHLORIDE"
## [52] "TETRACHLOROBENZENE 1,2,4,5"
## [53] "TRICHLOROBENZENE 1,2,4"
## [54] "TRICHLOROETHYLENE C2HCL3"
## [55] "TRICHLOROTOLUENE 2,4,5"
## [56] "SULPHATE, UNFILTERED REACTIVE"
## [57] "IRON, UNFILTERED TOTAL"
## [58] "SILVER, UNFILTERED TOTAL"
## [59] "CADMIUM, UNFILTERED TOTAL"
## [60] "DICHLOROBENZENE 1,2"
## [61] "CHLORINE, FREE UNFIL.REAC"
## [62] "PH (-LOG H+ CONCN)"
## [63] "SPECIFIC CONDUCTANCE"
## [64] "2,3,7,8 - T4CDD"
## [65] "2,3,7,8 - T4CDF"
## [66] "TOTAL TOXIC EQUIVALENT"
Some of the above parameters are reported as the below parameters and most of them haven’t been reported as specific parameters. For the purpose of visualization, let’s take a look as “Parameter Reported As”:
# what are these parameters reported as
unique(dd$`Parameter Reported As`)## [1] "AS NITROGEN" "NOT APPLICABLE" "AS CARBON"
## [4] "AS HYDROGEN CYANIDE" "AS PHOSPHORUS" "AS SULPHIDE"
## [7] "AS PHENOL" "AS OXYGEN" "AS SULPHATE"
## [10] "AT 25 DEG. C" "NOT APPL" "AS N"
## [13] "AS O" "AS P" "AS C"
## [16] "AS HCN" "PHENOL" "AS SO4"
## [19] "AS H2S" "AS ARSENIC"
So, we have 66 parameters to work with! Some of them pointing to the same character such as O and OXYGEN, or C and CARBON, and some can be ignored (including NOT APPLICABLE, NOT APPL, AT 25 DEG. C)
# subset the waste reported as a target parameter to make sure using the correct attribute
unique(subset(dd$`Parameter Name`, dd$`Parameter Reported As`=="AS O"))## [1] "BIOCHEMICAL OXYGEN DEMAND, 5 DAY, TOTAL DEMAND"
unique(subset(dd$`Parameter Name`, dd$`Parameter Reported As`=="AS OXYGEN"))## [1] "BIOCHEMICAL OXYGEN DEMAND, 5 DAY, TOTAL DEMAND"
These two parameters point to the same thing and can be used interchangeably (needs to use or when needed to call any of them to include them both). Great! So, by choosing “Parameter Reported As” we’re on the right track!
# subset the waste reported as a target parameter to make sure using the correct attribute
unique(subset(dd$`Parameter Name`, dd$`Parameter Reported As`=="AS HCN"))## [1] "CYANIDE, AVAIL, UNFIL.REAC"
unique(subset(dd$`Parameter Name`, dd$`Parameter Reported As`=="AS HYDROGEN CYANIDE"))## [1] "CYANIDE, AVAIL, UNFIL.REAC"
So, these two “As Reported” factors are the same sort of waste too!
# subset the waste reported as a target parameter to make sure using the correct attribute
unique(subset(dd$`Parameter Name`, dd$`Parameter Reported As`=="AS H2S"))## [1] "SULPHIDE, UNFILTERED REACTIVE"
unique(subset(dd$`Parameter Name`, dd$`Parameter Reported As`=="AS SULPHIDE" ))## [1] "SULPHIDE, UNFILTERED REACTIVE"
Also, these two!
# subset the waste reported as a target parameter to make sure using the correct attribute
unique(subset(dd$`Parameter Name`, dd$`Parameter Reported As`=="AS SO4"))## [1] "SULPHATE, UNFILTERED REACTIVE"
unique(subset(dd$`Parameter Name`, dd$`Parameter Reported As`=="AS SULPHATE"))## [1] "SULPHATE, UNFILTERED REACTIVE"
And these two!
# what are waste reported as C and as CARBON
# subset the waste reported as a target parameter to make sure using the correct attribute
unique(subset(dd$`Parameter Name`, dd$`Parameter Reported As`=="AS CARBON"))## [1] "CARBON, DISSOLVED ORGANIC"
unique(subset(dd$`Parameter Name`, dd$`Parameter Reported As`=="AS C"))## [1] "CARBON, DISSOLVED ORGANIC"
SO, these two parameters point to the same thing and can be used interchangeably (needs to use or when needed to call any of them to include them both).
# the nature of factories/companies producing these wastes
unique(dd$`Sector`)## [1] "INORGANIC CHEMICALS" "METAL CASTING"
## [3] "ORGANIC CHEMICAL MANUFACTURING" "IRON AND STEEL"
## [5] "PETROLEUM REFINERIES" "PULP AND PAPER"
## [7] "INDUSTRIAL MINERALS" "METAL MINING AND REFINING"
## [9] "ELECTRIC POWER GENERATION"
# now let's take a look at geographical (minicipality-wise) distribution of waste
d_asO = subset(dd, dd$`Parameter Reported As`=="AS O" |dd$`Parameter Reported As`=="AS OXYGEN")
ggplot(data=d_asO, aes(y=d_asO$Value, x=d_asO$Municipality)) + geom_point(alpha = 0.6, aes(color = Sector))+ theme_bw() +ylab("BOD (Kg/Day)") +xlab("Municipality")+theme(axis.text.x=element_text(angle=90, hjust=1))
So, it seems that Terrace Bay, Thunder Bay, Espanola, and Fort Frances produce the most discharges in terms of BOD within their Pulp and paper industries!
I’ll do the same to visualize some other key industrial wastes across municipalities
# now let's take a look at geographical (minicipality-wise) distribution of waste
d_asC = subset(dd, dd$`Parameter Reported As`=="AS C" | dd$`Parameter Reported As`=="AS CARBON")
ggplot(data=d_asC, aes(y=d_asC$Value, x=d_asC$Municipality)) + geom_point(alpha = 0.6, aes(color = Sector))+ theme_bw() +ylab("Dissolved Carbon (Kg/Day)") +xlab("Municipality")+theme(axis.text.x=element_text(angle=90, hjust=1))
Using this strategy it’s clear that which municipality has to deal with what sort of industrial wastes as well as how much the problem of discharge can be an issue!
# now let's take a look at geographical (minicipality-wise) distribution of waste
d_asH2S = subset(dd, dd$`Parameter Reported As`=="AS H2S")
ggplot(data=d_asH2S, aes(y=d_asH2S$Value, x=d_asH2S$Municipality)) + geom_point(alpha = 0.6, aes(color = Sector))+ theme_bw() +ylab("H2S (Kg/Day)") +xlab("Municipality")+theme(axis.text.x=element_text(angle=90, hjust=1))Unfortunately, Sarnia produces a huge amount of H2S…
# now let's take a look at geographical (minicipality-wise) distribution of waste
d_asN = subset(dd, dd$`Parameter Reported As`=="AS N" | dd$`Parameter Reported As`=="AS NITROGEN")
ggplot(data=d_asN, aes(y=d_asN$Value, x=d_asN$Municipality)) + geom_point(alpha = 0.6, aes(color = Sector))+ theme_bw() +ylab("Dissolved Nitogen (Kg/Day)") +xlab("Municipality")+theme(axis.text.x=element_text(angle=90, hjust=1))
Seems like there is a serious outlier reported for Nitrogen at St. Clair TWP
# now let's take a look at geographical (minicipality-wise) distribution of waste
d_asC = subset(dd, dd$`Parameter Reported As`=="AS C" | dd$`Parameter Reported As`=="AS CARBON")
ggplot(data=d_asC, aes(y=d_asC$Value, x=d_asC$Municipality)) + geom_point(alpha = 0.6, aes(color = Sector))+ theme_bw() +ylab("Dissolved Carbon (Kg/Day)") +xlab("Municipality")+theme(axis.text.x=element_text(angle=90, hjust=1))Since there are different naming have been applied to the same sort of industrial waste, I rename them. This unifies the observations records and also reduces the number of factors; industrial discharge/chemicals in wastewater.
## [1] "X1" "NOT APPL" "X6" "X3" "X5" "X4"
## [7] "X8" "X2" "X7" "ARSENIC"
Very good! Now, let’s choose a name that makes more sence for each!
dd$`Parameter Reported As` <- gsub("X1", "NITROGEN", dd$`Parameter Reported As`)
dd$`Parameter Reported As` <- gsub("X2", "OXYGEN", dd$`Parameter Reported As`)
dd$`Parameter Reported As` <- gsub("X3", "HYDROGEN CYANIDE", dd$`Parameter Reported As`)
dd$`Parameter Reported As` <- gsub("X4", "H2S", dd$`Parameter Reported As`)
dd$`Parameter Reported As` <- gsub("X5", "PHOSPHORUS", dd$`Parameter Reported As`)
dd$`Parameter Reported As` <- gsub("X6", "CARBON", dd$`Parameter Reported As`)
dd$`Parameter Reported As` <- gsub("X7", "SO4", dd$`Parameter Reported As`)
dd$`Parameter Reported As` <- gsub("X8", "PHENOL", dd$`Parameter Reported As`)
unique(dd$`Parameter Reported As`)## [1] "NITROGEN" "NOT APPL" "CARBON" "HYDROGEN CYANIDE"
## [5] "PHOSPHORUS" "H2S" "PHENOL" "OXYGEN"
## [9] "SO4" "ARSENIC"
Great!
Now, let’s explore the outliers for each reported parameter using Z-Score:
# for NITROGEN
a <- subset(dd$Value, dd$`Parameter Reported As`=="NITROGEN")
z <- a %>% scores(type ="z")
outliers <- a[which(abs(z)>3)]
b <- subset(dd, dd$`Parameter Reported As`=="NITROGEN")
ggplot(data=b, aes(y= b$Value, color = Sector)) + geom_boxplot() +theme_bw() +ylab("value, kg/day") + xlab("Sector") + theme(axis.text.x = element_blank())
visualization helps to remove with confidence the max of outiers as it is only one record and the gap between other data points and that particular outlier, suggest that there was a mistake putting that number in there.
dd <- subset(dd, dd$Value !="70804.3")Now, let’s explore the Nitrogen outliers again
# for NITROGEN
a <- subset(dd$Value, dd$`Parameter Reported As`=="NITROGEN")
z <- a %>% scores(type ="z")
outliers_NITROGEN <- a[which(abs(z)>3)]
b <- subset(dd, dd$`Parameter Reported As`=="NITROGEN")
ggplot(data=b, aes(y= b$Value, color = Sector)) + geom_boxplot() +theme_bw() +ylab("value, kg/day") +xlab("Sector") + theme(axis.text.x = element_blank())Awesome! Now it makes more sense!
# for CARBON
a <- subset(dd$Value, dd$`Parameter Reported As`=="CARBON")
z <- a %>% scores(type ="z")
outliers_CARBON <- a[which(abs(z)>3)]
b <- subset(dd, dd$`Parameter Reported As`=="CARBON")
ggplot(data=b, aes(y= b$Value, color = Sector)) + geom_boxplot() +theme_bw() +ylab("value, kg/day") +xlab("Sector") + theme(axis.text.x = element_blank())# for HYDROGEN CYANIDE
a <- subset(dd$Value, dd$`Parameter Reported As`=="HYDROGEN CYANIDE")
z <- a %>% scores(type ="z")
outliers_HYDROGENCYANIDE <- a[which(abs(z)>3)]
b <- subset(dd, dd$`Parameter Reported As`=="HYDROGEN CYANIDE")
ggplot(data=b, aes(y= b$Value, color = Sector)) + geom_boxplot() +theme_bw() +ylab("value, kg/day") +xlab("Sector") + theme(axis.text.x = element_blank())# for PHOSPHORUS
a <- subset(dd$Value, dd$`Parameter Reported As`=="PHOSPHORUS")
z <- a %>% scores(type ="z")
outliers_PHOSPHORUS <- a[which(abs(z)>3)]
b <- subset(dd, dd$`Parameter Reported As`=="PHOSPHORUS")
ggplot(data=b, aes(y= b$Value, color = Sector)) + geom_boxplot() +theme_bw() +ylab("value, kg/day") +xlab("Sector") + theme(axis.text.x = element_blank())# for H2S
a <- subset(dd$Value, dd$`Parameter Reported As`=="H2S")
z <- a %>% scores(type ="z")
outliers_H2S <- a[which(abs(z)>3)]
b <- subset(dd, dd$`Parameter Reported As`=="H2S")
ggplot(data=b, aes(y= b$Value, color = Sector)) + geom_boxplot() +theme_bw() +ylab("value, kg/day") +xlab("Sector") + theme(axis.text.x = element_blank())# for PHENOL
a <- subset(dd$Value, dd$`Parameter Reported As`=="PHENOL")
z <- a %>% scores(type ="z")
outliers_PHENOL <- a[which(abs(z)>3)]
b <- subset(dd, dd$`Parameter Reported As`=="PHENOL")
ggplot(data=b, aes(y= b$Value, color = Sector)) + geom_boxplot() +theme_bw() +ylab("value, kg/day") +xlab("Sector") + theme(axis.text.x = element_blank())# for OXYGEN
a <- subset(dd$Value, dd$`Parameter Reported As`=="OXYGEN")
z <- a %>% scores(type ="z")
outliers_OXYGEN <- a[which(abs(z)>3)]
b <- subset(dd, dd$`Parameter Reported As`=="OXYGEN")
ggplot(data=b, aes(y= b$Value, color = Sector)) + geom_boxplot() +theme_bw() +ylab("value, kg/day") +xlab("Sector") + theme(axis.text.x = element_blank())# for SO4
a <- subset(dd$Value, dd$`Parameter Reported As`=="SO4")
z <- a %>% scores(type ="z")
outliers_SO4 <- a[which(abs(z)>3)]
b <- subset(dd, dd$`Parameter Reported As`=="SO4")
ggplot(data=b, aes(y= b$Value, color = Sector)) + geom_boxplot() +theme_bw() +ylab("value, kg/day") +xlab("Sector") + theme(axis.text.x = element_blank())# for ARSENIC
a <- subset(dd$Value, dd$`Parameter Reported As`=="ARSENIC")
z <- a %>% scores(type ="z")
outliers_ARSENIC <- a[which(abs(z)>3)]
b <- subset(dd, dd$`Parameter Reported As`=="ARSENIC")
ggplot(data=b, aes(y= b$Value, color = Sector)) + geom_boxplot() +theme_bw() +ylab("value, kg/day") +xlab("Sector") + theme(axis.text.x = element_blank())# for FLOW
a <- subset(dd$Value, dd$`Parameter Name`=="FLOW")
z <- a %>% scores(type ="z")
outliers_FLOW <- a[which(abs(z)>3)]
b <- subset(dd, dd$`Parameter Name`=="FLOW")
ggplot(data=b, aes(y= b$Value, color = Sector)) + geom_boxplot() +theme_bw() +ylab("value, m3/day") +xlab("Sector") + theme(axis.text.x = element_blank())# for PH
a <- subset(dd$Value, dd$`Parameter Name`=="PH (-LOG H+ CONCN)" )
z <- a %>% scores(type ="z")
outliers_PH <- a[which(abs(z)>3)]
b <- subset(dd, dd$`Parameter Name`=="PH (-LOG H+ CONCN)" )
ggplot(data=b, aes(y= b$Value, color = Sector)) + geom_boxplot() +theme_bw() +ylab("pH") +xlab("Sector") + theme(axis.text.x = element_blank())# for CHLORIDE
a <- subset(dd$Value, dd$`Parameter Name`=="CHLORIDE, UNFIL.REAC" )
z <- a %>% scores(type ="z")
outliers_CHLORIDE<- a[which(abs(z)>3)]
b <- subset(dd, dd$`Parameter Name`=="CHLORIDE, UNFIL.REAC" )
ggplot(data=b, aes(y= b$Value, color = Sector)) + geom_boxplot() +theme_bw() +ylab("CHLORIDE, kg/day") +xlab("Sector") + theme(axis.text.x = element_blank())# for TOTAL TOXIC
a <- subset(dd$Value, dd$`Parameter Name`=="TOTAL TOXIC EQUIVALENT" )
z <- a %>% scores(type ="z")
outliers_TOXIC <- a[which(abs(z)>3)]
b <- subset(dd, dd$`Parameter Name`=="TOTAL TOXIC EQUIVALENT" )
ggplot(data=b, aes(y= b$Value, color = Sector)) + geom_boxplot() +theme_bw() +ylab("TOTAL TOXIC, kg/day") +xlab("Sector") + theme(axis.text.x = element_blank())# for FLUORIDE
a <- subset(dd$Value, dd$`Parameter Name`=="FLUORIDE, UNFILTERED REACTIVE" )
z <- a %>% scores(type ="z")
outliers_FLUORIDE <- a[which(abs(z)>3)]
b <- subset(dd, dd$`Parameter Name`=="FLUORIDE, UNFILTERED REACTIVE" )
ggplot(data=b, aes(y= b$Value, color = Sector)) + geom_boxplot() +theme_bw() +ylab("FLUORIDE, kg/day") +xlab("Sector") + theme(axis.text.x = element_blank())# for PARTICLE
a <- subset(dd$Value, dd$`Parameter Name`=="RESIDUE, PARTICULATE" )
z <- a %>% scores(type ="z")
outliers_PARTICLE <- a[which(abs(z)>3)]
b <- subset(dd, dd$`Parameter Name`=="RESIDUE, PARTICULATE" )
ggplot(data=b, aes(y= b$Value, color = Sector)) + geom_boxplot() +theme_bw() +ylab("PARTICLE, kg/day") +xlab("Sector") + theme(axis.text.x = element_blank())
## Decision about the existance of outliers
This visualization strategy helps to see which sector is producing what types of industrial discharge as well as have an understanding about the situations of the outliers!
Since these outliers come from site measurements and can reflect real spikes and fluctuations of various chemical discharges, the outliers are not being eliminated from the dataset.
# Qualitative feature selection
Well, our target parameter is BOD (biological oxygen demand) reported as oxygen, and it only appears in couple Sectors. So it makes sense to explore those sectors for other parameters and choose them as base for feature selection.
a <- unique(subset(dd$`Parameter Name`, dd$Sector == "ELECTRIC POWER GENERATION"))
b <- unique(subset(dd$`Parameter Name`, dd$Sector == "METAL CASTING"))
c <- unique(subset(dd$`Parameter Name`, dd$Sector == "PULP AND PAPER"))
d <- c(a, b, c)
unique(d)## [1] "ALUMINIUM, UNFILTERED TOTAL"
## [2] "FLOW"
## [3] "IRON, UNFILTERED TOTAL"
## [4] "RESIDUE, PARTICULATE"
## [5] "SOLVENT EXTRACTABLES"
## [6] "AMMONIUM+AMMONIA, TOTAL FILTER.REAC"
## [7] "BIOCHEMICAL OXYGEN DEMAND, 5 DAY, TOTAL DEMAND"
## [8] "PHOSPHORUS,UNFILTERED TOTAL"
## [9] "PH (-LOG H+ CONCN)"
## [10] "CARBON, DISSOLVED ORGANIC"
## [11] "CYANIDE, AVAIL, UNFIL.REAC"
## [12] "PHENOLICS, UNFILTERED REACTIVE"
## [13] "ZINC, UNFILTERED TOTAL"
## [14] "CHROMIUM, UNFILTERED TOTAL"
## [15] "COPPER, UNFILTERED TOTAL"
## [16] "SILVER, UNFILTERED TOTAL"
## [17] "CHLOROFORM CHCL3"
## [18] "PHENOL"
## [19] "TOLUENE C7H8"
## [20] "ADSORBABLE ORGANIC HALIDE"
## [21] "2,3,7,8 - T4CDD"
## [22] "2,3,7,8 - T4CDF"
## [23] "TOTAL TOXIC EQUIVALENT"
## [24] "SPECIFIC CONDUCTANCE"
Well, the above parameters are the parameters to explore through for feature selection!
dd_feature <- subset(dd, dd$`Parameter Name`==unique(d))
names(dd_feature)[names(dd_feature)=="Parameter Name"] <- "ParameterName"
names(dd_feature)[names(dd_feature)=="Component Type"] <- "ComponentType"
names(dd_feature)[names(dd_feature)=="Unit of Measure"] <- "Unit"
ddd <- dd_feature %>%
dplyr::select(Sector, Municipality, Date, ParameterName, Frequency, ComponentType, Value, Unit) %>%
distinct() %>%
dplyr::group_by(Date, ParameterName, Unit) %>%
dplyr::summarise(
m = mean(Value, na.rm=TRUE)
) %>%
pivot_wider(names_from = ParameterName, values_from = m)Renaming the columns to avoid future problems!
names(ddd)[names(ddd)=="SOLVENT EXTRACTABLES" ] <- "Solvent"
names(ddd)[names(ddd)=="ALUMINIUM, UNFILTERED TOTAL" ] <- "Aluminum"
names(ddd)[names(ddd)=="IRON, UNFILTERED TOTAL" ] <- "Iron"
names(ddd)[names(ddd)=="PHOSPHORUS,UNFILTERED TOTAL" ] <- "Phosphorus"
names(ddd)[names(ddd)=="RESIDUE, PARTICULATE" ] <- "Residue"
names(ddd)[names(ddd)=="AMMONIUM+AMMONIA, TOTAL FILTER.REAC" ] <- "Ammunium"
names(ddd)[names(ddd)=="BIOCHEMICAL OXYGEN DEMAND, 5 DAY, TOTAL DEMAND" ] <- "BOD"
names(ddd)[names(ddd)=="PH (-LOG H+ CONCN)" ] <- "pH"
names(ddd)[names(ddd)=="CARBON, DISSOLVED ORGANIC" ] <- "Carbon"
names(ddd)[names(ddd)=="COPPER, UNFILTERED TOTAL" ] <- "Copper"
names(ddd)[names(ddd)=="PHENOLICS, UNFILTERED REACTIVE" | names(ddd)=="PHENOL"] <- "Phenol"
names(ddd)[names(ddd)=="ZINC, UNFILTERED TOTAL" ] <- "Zinc"
names(ddd)[names(ddd)=="CHLOROFORM CHCL3" ] <- "Chloroform"
names(ddd)[names(ddd)=="CYANIDE, AVAIL, UNFIL.REAC" ] <- "Cyanide"
names(ddd)[names(ddd)=="TOLUENE C7H8" ] <- "Toluene"
names(ddd)[names(ddd)=="SPECIFIC CONDUCTANCE" ] <- "SpecificConductance"
names(ddd)[names(ddd)=="2,3,7,8 - T4CDD" ] <- "T4CDD"
names(ddd)[names(ddd)=="2,3,7,8 - T4CDF" ] <- "T4CDF"
names(ddd)[names(ddd)=="TOTAL TOXIC EQUIVALENT" ] <- "TotalToxic"
names(ddd)[names(ddd)=="CHROMIUM, UNFILTERED TOTAL" ] <- "Chromium"
names(ddd)[names(ddd)=="SILVER, UNFILTERED TOTAL" ] <- "Silver"
names(ddd)[names(ddd)=="ADSORBABLE ORGANIC HALIDE" ] <- "Halide"
ddd <- ddd[,-12]
names(ddd)## [1] "Date" "Unit" "Halide"
## [4] "Aluminum" "Ammunium" "BOD"
## [7] "Carbon" "Chloroform" "Copper"
## [10] "Cyanide" "FLOW" "Phenol"
## [13] "Phosphorus" "Residue" "Solvent"
## [16] "Toluene" "Zinc" "Iron"
## [19] "Chromium" "Silver" "pH"
## [22] "SpecificConductance" "T4CDD" "T4CDF"
## [25] "TotalToxic"
head(ddd)## # A tibble: 6 x 25
## # Groups: Date [3]
## Date Unit Halide Aluminum Ammunium BOD Carbon Chloroform Copper
## <date> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2004-01-01 KG/D 4 0.00522 393. 117. 7.06 0.193 3.4
## 2 2004-01-01 M3/D NA NA NA NA NA NA NA
## 3 2004-01-31 KG/D NA 10.4 3.61 NA 26.9 NA 3
## 4 2004-01-31 M3/D NA NA NA NA NA NA NA
## 5 2004-02-01 KG/D 293. NA 35.4 216. 909. 0.177 0.000866
## 6 2004-02-01 M3/D NA NA NA NA NA NA NA
## # … with 16 more variables: Cyanide <dbl>, FLOW <dbl>, Phenol <dbl>,
## # Phosphorus <dbl>, Residue <dbl>, Solvent <dbl>, Toluene <dbl>, Zinc <dbl>,
## # Iron <dbl>, Chromium <dbl>, Silver <dbl>, pH <dbl>,
## # SpecificConductance <dbl>, T4CDD <dbl>, T4CDF <dbl>, TotalToxic <dbl>
Great!
ddd_fs <- ddd[, -c(1:2)]
sapply((ddd_fs), class)## Halide Aluminum Ammunium BOD
## "numeric" "numeric" "numeric" "numeric"
## Carbon Chloroform Copper Cyanide
## "numeric" "numeric" "numeric" "numeric"
## FLOW Phenol Phosphorus Residue
## "numeric" "numeric" "numeric" "numeric"
## Solvent Toluene Zinc Iron
## "numeric" "numeric" "numeric" "numeric"
## Chromium Silver pH SpecificConductance
## "numeric" "numeric" "numeric" "numeric"
## T4CDD T4CDF TotalToxic
## "numeric" "numeric" "numeric"
Awesome! Let’s try the first feature selection strategy: Filter-based method: Since the reshaped dataset includes NAs, I first explore the dataset without imputing the NAs using the below lines of code:
corr <- cor(ddd_fs, use="pairwise.complete.obs")
corrplot(corr, na.label = " ")
## NAs in the reshaped dataset
Since, I have created a new dataset, and it contains lots of NAs, let’s explore the NAs again!
This shows the percentages of NAs within each attribute:
#missing values
p <- function(x){sum(is.na(x))/length(x)*100}
apply(ddd_fs, 2, p)## Halide Aluminum Ammunium BOD
## 83.53982 64.95575 62.83186 67.96460
## Carbon Chloroform Copper Cyanide
## 60.35398 71.32743 62.83186 66.72566
## FLOW Phenol Phosphorus Residue
## 59.64602 60.35398 60.35398 56.81416
## Solvent Toluene Zinc Iron
## 56.46018 67.43363 61.59292 75.22124
## Chromium Silver pH SpecificConductance
## 86.19469 94.51327 95.75221 96.46018
## T4CDD T4CDF TotalToxic
## 93.98230 93.98230 93.98230
Imputing missing values by Random Forest
ddd_imp <- missRanger(
ddd_fs,
pmm.k = 3,
num.trees = 1000,
seed=820
)##
## Missing value imputation by random forests
##
## Variables to impute: Halide, Aluminum, Ammunium, BOD, Carbon, Chloroform, Copper, Cyanide, FLOW, Phenol, Phosphorus, Residue, Solvent, Toluene, Zinc, Iron, Chromium, Silver, pH, SpecificConductance, T4CDD, T4CDF, TotalToxic
## Variables used to impute: Halide, Aluminum, Ammunium, BOD, Carbon, Chloroform, Copper, Cyanide, FLOW, Phenol, Phosphorus, Residue, Solvent, Toluene, Zinc, Iron, Chromium, Silver, pH, SpecificConductance, T4CDD, T4CDF, TotalToxic
## iter 1: .......................
## iter 2: .......................
## iter 3: .......................
## iter 4: .......................
## iter 5: .......................
## iter 6: .......................
head(ddd_imp)## # A tibble: 6 x 23
## Halide Aluminum Ammunium BOD Carbon Chloroform Copper Cyanide FLOW
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4 0.00522 393. 117. 7.06 0.193 3.4 0 11931.
## 2 171. 4 1.34 2933. 16.6 8.67 0.0147 0.241 35044.
## 3 237. 10.4 3.61 655. 26.9 1.08 3 1.02 808.
## 4 3.24 0.496 24.6 31.6 3.02 0.0790 0.00386 0.0269 42763.
## 5 293. 0.00376 35.4 216. 909. 0.177 0.000866 2.11 42370.
## 6 254. 0.0777 3.45 17.4 13.6 0.00931 2.35 0.504 25017.
## # … with 14 more variables: Phenol <dbl>, Phosphorus <dbl>, Residue <dbl>,
## # Solvent <dbl>, Toluene <dbl>, Zinc <dbl>, Iron <dbl>, Chromium <dbl>,
## # Silver <dbl>, pH <dbl>, SpecificConductance <dbl>, T4CDD <dbl>,
## # T4CDF <dbl>, TotalToxic <dbl>
Now, splitting my dataset to training and testing:
require(caTools)
set.seed(820)
sample <- sample.split(ddd_imp$BOD, SplitRatio = 0.75)
ddd_imp_train <- subset(ddd_imp, sample == TRUE)
ddd_imp_test <- subset(ddd_imp, sample == FALSE)shapiro.test(ddd_imp_train$BOD)##
## Shapiro-Wilk normality test
##
## data: ddd_imp_train$BOD
## W = 0.63098, p-value < 2.2e-16
The results from Shapiro-Wilk normality test indicates that the dataset is highly-skewed!
Due to the skewed-dataset, I use Spearman cor test for imputed dataset.
Now, let’s try again the correlation analysis with imputed values! :) This belongs to Filter-based methods family:
corr <- cor(ddd_imp_train, method = "spearman")
corrplot(corr)So, it seems BOD has strong positive correlation with Cyanide, and then Ammunium, Halide, Carbon, Aluminum and negative correlation with Chloroform and Toluene.
To see the numbers using cor analysis:
corr <- cor(ddd_imp_train, method = "spearman")
corrplot(corr, addCoef.col = "black", method = "square", number.cex = 1.5)
So, based on correlation analysis, Flow, Aluminium, Zinc, Chloroform, Silver, Toluene, pH, Residue are amongst the most important contributors.
Let’s perform a wrapper-based feature selection technique:
boruta_result <- Boruta(BOD~., data=ddd_imp_train, , doTrace=0)
print(boruta_result)## Boruta performed 99 iterations in 27.46612 secs.
## 7 attributes confirmed important: Aluminum, Chloroform, FLOW, pH,
## Residue and 2 more;
## 13 attributes confirmed unimportant: Ammunium, Carbon, Chromium,
## Copper, Cyanide and 8 more;
## 2 tentative attributes left: Phenol, Toluene;
To see the full list of the selected parameters:
boruta_result[1]## $finalDecision
## Halide Aluminum Ammunium Carbon
## Rejected Confirmed Rejected Rejected
## Chloroform Copper Cyanide FLOW
## Confirmed Rejected Rejected Confirmed
## Phenol Phosphorus Residue Solvent
## Tentative Rejected Confirmed Rejected
## Toluene Zinc Iron Chromium
## Tentative Confirmed Rejected Rejected
## Silver pH SpecificConductance T4CDD
## Confirmed Confirmed Rejected Rejected
## T4CDF TotalToxic
## Rejected Rejected
## Levels: Tentative Confirmed Rejected
Amazing! :)
# eval <- wrapperEvaluator("knn")
# search <- searchAlgorithm("sequentialForwardSelection")
# res <- featureSelection(ddd_imp_train, "BOD", search, eval)
# res$bestFeaturesNow, I’ll try a Filter-based feature selection technique:
# eval <- filterEvaluator('determinationCoefficient')
# search <- searchAlgorithm("sequentialForwardSelection")
# res <- featureSelection(ddd_imp_train, "BOD", search, eval)
# res$bestFeatures# evaluator_1 <- filterEvaluator('determinationCoefficient')
# evaluator_2 <- filterEvaluator('ReliefFeatureSetMeasure')
#
# hybridSearcher <- hybridSearchAlgorithm('LCC')
#
# results <- hybridFeatureSelection(ddd_imp_train, 'BOD', hybridSearcher, evaluator_1, evaluator_2)
# results$bestFeaturesThe three more methods (please see the above trials) for feature selection have failed - what I learn though is that not all feature selection works for all datasets. depending on different data characteristics (normality/skewness and classtype, etc.) - very useful article: https://machinelearningmastery.com/feature-selection-with-real-and-categorical-data/
and also, https://www.sheffield.ac.uk/polopoly_fs/1.885202!/file/95_Normality_Check.pdf
knitr::include_graphics("/Users/zeinabkhansari/Desktop/cap1.png")
Well, based on the results from Filter-based (Spearman correlation analysis) and Wrapper-based (Boruta) feature selection methods, I am choosing:
Aluminum
Chloroform
FLOW
Residue
Zinc
Silver
pH
as attributes that impacts the BOD (Biological Oxygen Demand)!